{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "21bfcfd6-f181-4954-b580-eacd37a539bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.retrievers.document_compressors import LLMChainExtractor\n",
    "from langchain.retrievers import ContextualCompressionRetriever\n",
    "from langchain.chains import RetrievalQA\n",
    "from langchain_community.chat_models import ChatZhipuAI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e8e99c9e-aeb3-4cd8-ac2f-34b3c6929327",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\9N3GPC3\\AppData\\Local\\Temp\\ipykernel_15212\\2810499957.py:12: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the langchain-huggingface package and should be used instead. To use it run `pip install -U langchain-huggingface` and import as `from langchain_huggingface import HuggingFaceEmbeddings`.\n",
      "  hf_embedding = HuggingFaceEmbeddings(\n",
      "D:\\python311\\Lib\\site-packages\\sentence_transformers\\cross_encoder\\CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
      "  from tqdm.autonotebook import tqdm, trange\n"
     ]
    }
   ],
   "source": [
    "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
    "from langchain_community.docstore.in_memory import InMemoryDocstore\n",
    "from langchain_community.vectorstores import FAISS\n",
    "import faiss\n",
    "from uuid import uuid4\n",
    "from langchain_core.documents import Document\n",
    "\n",
    "model_name =  \"../../bge-small-zh-v1.5\"\n",
    "model_kwargs = {'device': 'cpu'}\n",
    "encode_kwargs = {'normalize_embeddings': True}\n",
    "\n",
    "hf_embedding = HuggingFaceEmbeddings(\n",
    "    model_name=model_name,\n",
    "    model_kwargs=model_kwargs,\n",
    "    encode_kwargs=encode_kwargs\n",
    ")\n",
    "\n",
    "\n",
    "index = faiss.IndexFlatIP(len(hf_embedding.embed_query(\"hello world\")))\n",
    "vector_store = FAISS(\n",
    "    embedding_function=hf_embedding,\n",
    "    index=index,\n",
    "    docstore=InMemoryDocstore(),\n",
    "    index_to_docstore_id={},\n",
    ")\n",
    "\n",
    "text = \"\"\"蔚来萤火虫即将交付，李斌所预测的两年内新能源车渗透率超80%能否实现？\n",
    "\n",
    "9 月 6 日消息，近日，蔚来汽车创始人、董事长李斌在蔚来九周年内部讲话及财报电话会上透露了多项重要信息，其中最引人注目的莫过于第三品牌萤火虫将于 2025 年正式交付，并预测新能源汽车的市场渗透率将在未来两年内超过 80%。\n",
    "\n",
    "据李斌介绍，蔚来汽车将形成三个品牌矩阵，覆盖从 14 万元到 80 万元的广阔市场区间。其中，第三品牌萤火虫作为蔚来汽车布局中低端市场的重要棋子，将于 2025 年正式交付。这一举措不仅丰富了蔚来的产品线，也进一步提升了其在新能源汽车市场的竞争力。\n",
    "\n",
    "This chunker works by determining when to \"break\" apart sentences. This is done by looking for differences in embeddings between any two sentences. When that difference is past some threshold, then they are split.\n",
    "\n",
    "There are a few ways to determine what that threshold is, which are controlled by the breakpoint_threshold_type kwarg.\"\"\"\n",
    "\n",
    "documents = []\n",
    "for row in text.split('\\n'):\n",
    "    if not row:\n",
    "        continue\n",
    "    documents.append(Document(\n",
    "        page_content=row,\n",
    "            metadata={}))\n",
    "    \n",
    "uuids = [str(uuid4()) for _ in range(len(documents))]\n",
    "vector_store.add_documents(documents=documents, ids=uuids)\n",
    "\n",
    "results = vector_store.similarity_search(\n",
    "    \"蔚来\",\n",
    "    k=2,\n",
    ") "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c4ba5be8-6cc7-43dc-a7b0-48a0aa6d55a7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content='据李斌介绍，蔚来汽车将形成三个品牌矩阵，覆盖从 14 万元到 80 万元的广阔市场区间。其中，第三品牌萤火虫作为蔚来汽车布局中低端市场的重要棋子，将于 2025 年正式交付。这一举措不仅丰富了蔚来的产品线，也进一步提升了其在新能源汽车市场的竞争力。'),\n",
       " Document(page_content='9 月 6 日消息，近日，蔚来汽车创始人、董事长李斌在蔚来九周年内部讲话及财报电话会上透露了多项重要信息，其中最引人注目的莫过于第三品牌萤火虫将于 2025 年正式交付，并预测新能源汽车的市场渗透率将在未来两年内超过 80%。')]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "62b12a8f-6a15-46ab-bc79-1d5b6749d06f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "这篇文档的主要话题是蔚来汽车及其创始人李斌在蔚来九周年内部讲话及财报电话会上透露的新信息，特别是关于蔚来汽车即将推出的第三品牌萤火虫，以及新能源汽车市场的发展预测。\n",
      "Source documents: [Document(page_content='This chunker works by determining when to \"break\" apart sentences. This is done by looking for differences in embeddings between any two sentences. When that difference is past some threshold, then they are split.'), Document(page_content='9 月 6 日消息，近日，蔚来汽车创始人、董事长李斌在蔚来九周年内部讲话及财报电话会上透露了多项重要信息，其中最引人注目的莫过于第三品牌萤火虫将于 2025 年正式交付，并预测新能源汽车的市场渗透率将在未来两年内超过 80%。'), Document(page_content='据李斌介绍，蔚来汽车将形成三个品牌矩阵，覆盖从 14 万元到 80 万元的广阔市场区间。其中，第三品牌萤火虫作为蔚来汽车布局中低端市场的重要棋子，将于 2025 年正式交付。这一举措不仅丰富了蔚来的产品线，也进一步提升了其在新能源汽车市场的竞争力。')]\n"
     ]
    }
   ],
   "source": [
    "\n",
    "retriever = vector_store.as_retriever()\n",
    "llm = ChatZhipuAI(\n",
    "    temperature=0.5,\n",
    "    api_key=\"f97b28aa71892d6d515007ef44ee5de6.WvLXD1MsFsGklOZa\",\n",
    "    model=\"glm-4-flash\",\n",
    ")\n",
    "\n",
    "compressor = LLMChainExtractor.from_llm(llm)\n",
    "\n",
    "compression_retriever = ContextualCompressionRetriever(\n",
    "    base_compressor=compressor,\n",
    "    base_retriever=retriever\n",
    ")\n",
    "\n",
    "qa_chain = RetrievalQA.from_chain_type(\n",
    "    llm=llm,\n",
    "    retriever=compression_retriever,\n",
    "    return_source_documents=True\n",
    ")\n",
    "\n",
    "query = \"这篇文档的主要话题是什么？\"\n",
    "result = qa_chain.invoke({\"query\": query})\n",
    "print(result[\"result\"])\n",
    "print(\"Source documents:\", result[\"source_documents\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f42d535-25ad-4966-8e69-45c52963b6e8",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0c28ab53-4875-48ab-b3e5-351d49ae7e59",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0rc2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
